#link: https://www.kaggle.com/abcsds/pokemon
pokemon <- read.csv("Pokemon.csv")
library(ggplot2)
library(car) #gives LeveneTest
library(EnvStats) #for Box-Cox transformations
library(MASS) # for model selecting
library(plotly)
library(knitr)
library(forcats)
library(rcompanion)
library(moments)
library(magick)
library(grid)
#we must convert Generation into a factor
pokemon$Generation <- as.factor(pokemon$Generation)
str(pokemon)
## 'data.frame': 800 obs. of 13 variables:
## $ X. : int 1 2 3 3 4 5 6 6 6 7 ...
## $ Name : Factor w/ 800 levels "Abomasnow","AbomasnowMega Abomasnow",..: 81 330 746 747 103 104 100 101 102 666 ...
## $ Type.1 : Factor w/ 18 levels "Bug","Dark","Dragon",..: 10 10 10 10 7 7 7 7 7 18 ...
## $ Type.2 : Factor w/ 19 levels "","Bug","Dark",..: 15 15 15 15 1 1 9 4 9 1 ...
## $ Total : int 318 405 525 625 309 405 534 634 634 314 ...
## $ HP : int 45 60 80 80 39 58 78 78 78 44 ...
## $ Attack : int 49 62 82 100 52 64 84 130 104 48 ...
## $ Defense : int 49 63 83 123 43 58 78 111 78 65 ...
## $ Sp..Atk : int 65 80 100 122 60 80 109 130 159 50 ...
## $ Sp..Def : int 65 80 100 120 50 65 85 85 115 64 ...
## $ Speed : int 45 60 80 80 65 80 100 100 100 43 ...
## $ Generation: Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Legendary : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
#notice that mega pokemon are counted as their own (e.g., AbomasnowMega)
kable(head(pokemon,15))
| X. | Name | Type.1 | Type.2 | Total | HP | Attack | Defense | Sp..Atk | Sp..Def | Speed | Generation | Legendary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Bulbasaur | Grass | Poison | 318 | 45 | 49 | 49 | 65 | 65 | 45 | 1 | False |
| 2 | Ivysaur | Grass | Poison | 405 | 60 | 62 | 63 | 80 | 80 | 60 | 1 | False |
| 3 | Venusaur | Grass | Poison | 525 | 80 | 82 | 83 | 100 | 100 | 80 | 1 | False |
| 3 | VenusaurMega Venusaur | Grass | Poison | 625 | 80 | 100 | 123 | 122 | 120 | 80 | 1 | False |
| 4 | Charmander | Fire | 309 | 39 | 52 | 43 | 60 | 50 | 65 | 1 | False | |
| 5 | Charmeleon | Fire | 405 | 58 | 64 | 58 | 80 | 65 | 80 | 1 | False | |
| 6 | Charizard | Fire | Flying | 534 | 78 | 84 | 78 | 109 | 85 | 100 | 1 | False |
| 6 | CharizardMega Charizard X | Fire | Dragon | 634 | 78 | 130 | 111 | 130 | 85 | 100 | 1 | False |
| 6 | CharizardMega Charizard Y | Fire | Flying | 634 | 78 | 104 | 78 | 159 | 115 | 100 | 1 | False |
| 7 | Squirtle | Water | 314 | 44 | 48 | 65 | 50 | 64 | 43 | 1 | False | |
| 8 | Wartortle | Water | 405 | 59 | 63 | 80 | 65 | 80 | 58 | 1 | False | |
| 9 | Blastoise | Water | 530 | 79 | 83 | 100 | 85 | 105 | 78 | 1 | False | |
| 9 | BlastoiseMega Blastoise | Water | 630 | 79 | 103 | 120 | 135 | 115 | 78 | 1 | False | |
| 10 | Caterpie | Bug | 195 | 45 | 30 | 35 | 20 | 20 | 45 | 1 | False | |
| 11 | Metapod | Bug | 205 | 50 | 20 | 55 | 25 | 25 | 30 | 1 | False |
#we make tables to get an idea of the amount of pokemon by each type
type.table <- table(pokemon$Type.1, pokemon$Type.2)
kable(type.table)
| Bug | Dark | Dragon | Electric | Fairy | Fighting | Fire | Flying | Ghost | Grass | Ground | Ice | Normal | Poison | Psychic | Rock | Steel | Water | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Bug | 17 | 0 | 0 | 0 | 2 | 0 | 2 | 2 | 14 | 1 | 6 | 2 | 0 | 0 | 12 | 0 | 3 | 7 | 1 |
| Dark | 10 | 0 | 0 | 3 | 0 | 0 | 2 | 3 | 5 | 2 | 0 | 0 | 2 | 0 | 0 | 2 | 0 | 2 | 0 |
| Dragon | 11 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 6 | 0 | 0 | 5 | 3 | 0 | 0 | 4 | 0 | 0 | 0 |
| Electric | 27 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 5 | 1 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 3 | 1 |
| Fairy | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Fighting | 20 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 2 | 0 |
| Fire | 28 | 0 | 0 | 1 | 0 | 0 | 7 | 0 | 6 | 0 | 0 | 3 | 0 | 2 | 0 | 2 | 1 | 1 | 1 |
| Flying | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Ghost | 10 | 0 | 1 | 2 | 0 | 0 | 0 | 3 | 2 | 0 | 10 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 |
| Grass | 33 | 0 | 3 | 1 | 0 | 2 | 3 | 0 | 5 | 0 | 0 | 1 | 3 | 0 | 15 | 2 | 0 | 2 | 0 |
| Ground | 13 | 0 | 3 | 2 | 1 | 0 | 0 | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 1 | 0 |
| Ice | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 3 | 0 | 0 | 0 | 2 | 0 | 0 | 3 |
| Normal | 61 | 0 | 0 | 0 | 0 | 5 | 2 | 0 | 24 | 0 | 2 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 1 |
| Poison | 15 | 1 | 3 | 1 | 0 | 0 | 2 | 0 | 3 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Psychic | 38 | 0 | 1 | 0 | 0 | 6 | 3 | 1 | 6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Rock | 9 | 2 | 2 | 2 | 0 | 3 | 1 | 0 | 4 | 0 | 2 | 6 | 2 | 0 | 0 | 2 | 0 | 3 | 6 |
| Steel | 5 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 1 | 4 | 0 | 2 | 0 | 0 | 0 | 7 | 3 | 0 | 0 |
| Water | 59 | 0 | 6 | 2 | 2 | 2 | 3 | 0 | 7 | 2 | 3 | 10 | 3 | 0 | 3 | 5 | 4 | 1 | 0 |
t1.table <- table(pokemon$Type.1)
kable(t1.table)
| Var1 | Freq |
|---|---|
| Bug | 69 |
| Dark | 31 |
| Dragon | 32 |
| Electric | 44 |
| Fairy | 17 |
| Fighting | 27 |
| Fire | 52 |
| Flying | 4 |
| Ghost | 32 |
| Grass | 70 |
| Ground | 32 |
| Ice | 24 |
| Normal | 98 |
| Poison | 28 |
| Psychic | 57 |
| Rock | 44 |
| Steel | 27 |
| Water | 112 |
t2.table <- table(pokemon$Type.2)
kable(t2.table)
| Var1 | Freq |
|---|---|
| 386 | |
| Bug | 3 |
| Dark | 20 |
| Dragon | 18 |
| Electric | 6 |
| Fairy | 23 |
| Fighting | 26 |
| Fire | 12 |
| Flying | 97 |
| Ghost | 14 |
| Grass | 25 |
| Ground | 35 |
| Ice | 14 |
| Normal | 4 |
| Poison | 34 |
| Psychic | 33 |
| Rock | 14 |
| Steel | 22 |
| Water | 14 |
| Above, we h | ave a table showing the number of pokemon with an interaction between Type 1 and Type 2, type 1, and type 2. |
#see if generations have different numbers of Type 1's and base stat totals
ggplot(pokemon, aes(Type.1, Total, color = Generation)) + geom_boxplot() + theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.grid.major = element_line(linetype = "blank"),
panel.background = element_rect(fill = "thistle1"),
plot.background = element_rect(fill = "cyan2")) +labs(title = "Stat Total by Type 1 and Generation",
x = "Type 1")
#now type 2:
ggplot(pokemon, aes(Type.2, Total, color = Generation)) + geom_boxplot() + theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.grid.major = element_line(linetype = "blank"),
panel.background = element_rect(fill = "rosybrown1"),
plot.background = element_rect(fill = "aquamarine1")) +labs(title = "Stat Total by Type 2 and Generation",
x = "Type 2")
#view avg BST by generation in a plot
bst.by.gen <- aggregate(Total ~ Generation, data = pokemon, FUN = mean)
bst.by.gen.plot <- ggplot(bst.by.gen, aes(x=Generation, y=Total, group = 1)) + geom_point(col = "white") + geom_line(color="red")+ theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.grid.major = element_line(linetype = "blank"),
panel.grid.minor = element_line(linetype = "blank"),
panel.background = element_rect(fill = "black"),
plot.background = element_rect(fill = "red")) + ylab("Average BST") + ggtitle("Average BST by Generation")
bst.by.gen.plot
#interactive version:
ggplotly(bst.by.gen.plot)
#
ggplot(pokemon, aes(Total, color=Generation, fill=Generation)) + geom_histogram() + theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.background = element_rect(fill = "honeydew1"),
plot.background = element_rect(fill = "lightsteelblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Plot frequency of each type
#fct_infreq orders from least to greatest for type I
ggplot(pokemon, aes(x=fct_infreq(Type.1))) + geom_bar(fill = "aquamarine", color = "red") + theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.grid.minor = element_line(linetype = "blank"),
panel.background = element_rect(fill = "thistle1"),
plot.background = element_rect(fill = "orange1"))+labs(title = "Frequency of each Type I",
x = "Type I", y = "Count")
ggplot(pokemon, aes(x=fct_infreq(Type.2))) + geom_bar(fill = "aquamarine", color = "red") + theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.grid.minor = element_line(linetype = "blank"),
panel.background = element_rect(fill = "ivory"),
plot.background = element_rect(fill = "tomato2"))+labs(title = "Frequency of each Type II",
x = "Type II", y = "Count")+labs(caption = "*Note that the empty label means no secondary type")
### Histograms for each stat by Generation
#make histograms
#hp
hist.hp <- ggplot(pokemon, aes(x=HP)) + geom_histogram(col = "black", fill = "red") + facet_grid(.~Generation) + ggtitle("Distribution of HP stat by Generation")
#atk
hist.atk <- ggplot(pokemon, aes(x=Attack)) + geom_histogram(col = "red", fill = "blue") + facet_grid(.~Generation) + ggtitle("Distribution of Attack stat by Generation")
#def
hist.def <- ggplot(pokemon, aes(x=Defense)) + geom_histogram(col = "blue", fill = "aquamarine") + facet_grid(.~Generation) + ggtitle("Distribution of Defense stat by Generation")
#sp atk
hist.spatk <- ggplot(pokemon, aes(x=Sp..Atk)) + geom_histogram(col = "grey", fill = "yellow") + facet_grid(.~Generation) + ggtitle("Distribution of Special Attack stat by Generation")
#sp def
hist.spdef <- ggplot(pokemon, aes(x=Sp..Def)) + geom_histogram(col = "blue", fill = "royalblue2") + facet_grid(.~Generation) + ggtitle("Distribution of Special Defense stat by Generation")
#spd
hist.spd <- ggplot(pokemon, aes(x=Speed)) + geom_histogram(col = "yellow", fill = "blue") + facet_grid(.~Generation) + ggtitle("Distribution of Speed stat by Generation")
#######
ggplotly(hist.hp)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.atk)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.def)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.spatk)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.spdef)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.spd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#notice how they are all approximately normal and may contain some outliets
hist.bst <- ggplot(pokemon, aes(x=Total)) + geom_histogram(col = "green", fill = "purple") + facet_grid(.~Generation) + ggtitle("Distribution of Base Stat Total (BST) by Generation") + theme(plot.subtitle = element_text(vjust = 1),
plot.caption = element_text(vjust = 1),
panel.background = element_rect(fill = "honeydew1"),
plot.background = element_rect(fill = "antiquewhite"))
ggplotly(hist.bst)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram suggests BST is approximately normally distributed across all generations.
pokemon.lm.model <- lm(Total ~ Generation, data = pokemon)
pokemon.aov <- aov(pokemon.lm.model)
summary(pokemon.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Generation 5 110928 22186 1.547 0.173
## Residuals 794 11387586 14342
#so no significant difference between at least one generation for BST
plot(pokemon.aov)
#our residuals look random, errors look normal, so this is good
#########################################
#One-Way ANOVA on Legendary and BST
legendary.model <- lm(Total ~ Legendary, data = pokemon)
legendary.aov <- aov(legendary.model)
summary(legendary.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Legendary 1 2894883 2894883 268.5 <2e-16 ***
## Residuals 798 8603631 10781
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#This suggest legendaries have a higher BST than non-legendaries
ggplot(pokemon, aes(pokemon.lm.model$residuals)) + geom_histogram(color = "seagreen2") + xlab("Residuals")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This plot suggests our errors are skewed left, which violates the normality assumption of the residuals.
resid <- pokemon.lm.model$residuals
shapiro.wilk.test <- shapiro.test(resid)
shapiro.wilk.test
##
## Shapiro-Wilk normality test
##
## data: resid
## W = 0.9833, p-value = 6.529e-08
#so normality assumption is violated
brown.fors.test <- leveneTest(Total ~ Generation, data = pokemon, center = median)
brown.fors.test
#so equal variance by group condition is met
These results mean that although our equal variance condition is met for any reasonable value of \(\alpha\), our errors are not approximately normally distributed. Therefore, we will consider transforming our variables to satisfy ANOVA assumptions.
#check skew of response
skewness(pokemon$Total)
## [1] 0.1522438
#try square rooting the Response Variable
transform.pok <- lm(sqrt(Total) ~ Generation, data = pokemon)
plot(transform.pok)
resid2 <- transform.pok$residuals
shapiro.wilk.test2 <- shapiro.test(resid2)
shapiro.wilk.test2
##
## Shapiro-Wilk normality test
##
## data: resid2
## W = 0.98315, p-value = 5.748e-08
#so assumptions not met
#now try log(10)
transform.pok2 <- lm(log10(Total) ~ Generation, data = pokemon)
plot(transform.pok2)
resid3 <- transform.pok2$residuals
shapiro.wilk.test3 <- shapiro.test(resid3)
shapiro.wilk.test3
##
## Shapiro-Wilk normality test
##
## data: resid3
## W = 0.97164, p-value = 2.379e-11
#assumptions not met
Since transformations did not satisfy ANOVA assumptions, we will use non-parametric methods for our hypothesis test.
#add a rank column
pokemon$Rank = rank(pokemon$Total, ties = "average")
#get F-stat
F.OBS = summary(lm(Total ~ Generation, data = pokemon))$fstatistic["value"]
permuted.data = pokemon #So we don't overwrite the original data
permuted.data$Generation = sample(permuted.data$Generation, nrow(permuted.data), replace = FALSE) #Permuting the groups
Fi = summary(lm(Total ~ Generation, data = permuted.data))$fstatistic["value"]
#we will permute the data 3000 times
R = 3000
many.perms = sapply(1:R,function(i){
permuted.data = pokemon #So we don't overwrite the original data
permuted.data$Generation = sample(permuted.data$Generation, nrow(permuted.data), replace = FALSE) #Permuting the groups
Fi = summary(lm(Total ~ Generation, data = permuted.data))$fstatistic["value"]
return(Fi)
})
hist(many.perms, main = "Distribution for Permuted F values", xlab = "Fi")
points(y = 0, x = F.OBS, pch = 17)
#we have a distribution that is skewed right
#get p-val:
mean(many.perms >= F.OBS)
## [1] 0.1783333
#try w/ Kruskall-Wallis test to compare:
nt <- nrow(pokemon)
Ri <- aggregate(Rank ~ Generation, data = pokemon, mean)$Rank
SR.2 = var(pokemon$Rank)
ni = aggregate(Total ~ Generation, data = pokemon, length)$Total
KW.OBS = 1/SR.2*sum(ni*(Ri - (nt+1)/2)^2) #Note, this assumes you calculate ni and Ri above
R = 3000
many.perms.KW = sapply(1:R,function(i){
permuted.data = pokemon #So we don't overwrite the original data
permuted.data$Generation = sample(permuted.data$Generation, nrow(permuted.data), replace = FALSE) #Permuting the groups
SR.2 = var(permuted.data$Rank)
ni = aggregate(Rank ~ Generation, data = permuted.data,length)$Rank
Ri = aggregate(Rank ~ Generation, data = permuted.data,mean)$Rank
KW.i= 1/SR.2*sum(ni*(Ri - (nt+1)/2)^2)
return(KW.i)
})
#p-val for KW:
p.value = mean(many.perms.KW > KW.OBS)
p.value
## [1] 0.09633333
Both of these non-parametric tests lead to the conclusion that we cannot conclude the distribution of total (BST) is different for at least one generation.